# Reference: https://www.statmethods.net/advstats/glm.html
# Load R packages into the library
# Data management packages
library(DescTools)
library(skimr)
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(aod)
library(readxl)
# Visualization packages
#library(Deducer)
library(ggplot2)
# Machine learning method packages
library(ROCR)
## Loading required package: gplots
## Registered S3 method overwritten by 'gdata':
## method from
## reorder.factor DescTools
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:DescTools':
##
## MAE, RMSE
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
################# 12 oct 2019 #########
# Import dataset # lending Club
#https://www.kaggle.com/wendykan/lending-club-loan-data
#https://www.lendingclub.com/developers/listed-loans
# Switch to the folder holding the Lending Club export and read the raw CSV.
# NOTE(review): setwd() ties the script to one machine's directory layout.
setwd("~/Downloads/001.Analytics Adayar")
loan_data <- read.csv("loan.csv")
# Keep only the variables used in this exercise. The list below was supplied
# for practice; when working with a new CSV, inspect its columns first and
# decide which ones are relevant to the model.
loan_data <- loan_data[
  c(
    "grade", "sub_grade", "term", "loan_amnt", "issue_d", "loan_status",
    "emp_length", "home_ownership", "annual_inc", "verification_status",
    "purpose", "dti", "delinq_2yrs", "addr_state", "int_rate",
    "inq_last_6mths", "mths_since_last_delinq", "mths_since_last_record",
    "open_acc", "pub_rec", "revol_bal", "revol_util", "total_acc"
  )
]
# Frequency of each loan sub-grade (captured output of a previous run below).
table(loan_data[["sub_grade"]])
##
## A1 A2 A3 A4 A5 B1 B2 B3 B4 B5 C1
## 86790 69562 73184 95874 107617 125341 126621 131514 139793 140288 145903
## C2 C3 C4 C5 D1 D2 D3 D4 D5 E1 E2
## 131116 129193 127115 116726 81787 72899 64819 56896 48023 33573 29924
## E3 E4 E5 F1 F2 F3 F4 F5 G1 G2 G3
## 26708 22763 22671 13413 9305 7791 6124 5167 4106 2688 2094
## G4 G5
## 1712 1568
# Data management for missing observations
# Recode a missing "months since last delinquency / public record" to 0.
# NOTE(review): presumably NA means "no such event on record"; after this
# recode it is indistinguishable from "0 months ago" - confirm this is intended.
loan_data$mths_since_last_delinq[is.na(loan_data$mths_since_last_delinq)] <- 0
loan_data$mths_since_last_record[is.na(loan_data$mths_since_last_record)] <- 0
# For every column, record whether it still contains at least one NA
# (a list with one logical per column).
var.has.na <- lapply(loan_data, function(x){any(is.na(x))})
# which() returns the positions (named by column) of the columns flagged TRUE,
# i.e. the columns that still contain NA. These are column indices, not counts.
num_na <- which(var.has.na == TRUE)
# Proportion of NA values in each of those columns (0 to 1, named by column).
# The earlier version divided the column positions in num_na by the number of
# rows, which is not a measure of missingness.
per_na <- vapply(loan_data[num_na], function(x) mean(is.na(x)), numeric(1))
# Keep only the rows that have no NA in any of the selected columns.
loan_data <- loan_data[complete.cases(loan_data),]
# Visualization of the data
# Histogram of the loan amount: number of loans per 1000-wide amount bin,
# with bars shaded by their count.
# The x aesthetic refers to the column by name. Writing loan_data$loan_amnt
# inside aes() bypasses the plot's `data` argument, so the layer would keep
# using the global vector even if the plot data were later replaced or faceted.
# NOTE(review): the bins stop at 35000; presumably no loan exceeds that amount -
# verify against max(loan_data$loan_amnt).
loanamount_barchart <- ggplot(data = loan_data, aes(x = loan_amnt)) +
  geom_histogram(
    breaks = seq(0, 35000, by = 1000),
    col = "black",
    aes(fill = ..count..)
  ) +
  scale_fill_gradient("Count", low = "green1", high = "yellowgreen") +
  labs(title = "Loan Amount", x = "Amount", y = "Number of Loans")
loanamount_barchart
# Interactive version of the same plot. The plot object is passed explicitly
# so the result does not depend on which plot happens to be the "last" one.
ggplotly(p = loanamount_barchart)
# Box plot of loan amount
# One box per loan status showing the distribution of loan amounts; points
# drawn beyond the whiskers are the outliers.
box_plot_stat <- ggplot(loan_data, aes(loan_status, loan_amnt))
# The x-axis tick labels are blanked; the fill legend identifies each status.
# Labels are passed to labs() as named arguments. Wrapping them in a single
# list() hands labs() one unnamed argument, which current ggplot2 does not
# unpack, so the title and axis labels would not be applied.
box_plot_stat + geom_boxplot(aes(fill = loan_status)) +
  theme(axis.text.x = element_blank()) +
  labs(title = "Loan amount by status", x = "Loan Status", y = "Amount")
# Interactive version of the box plot built just above (ggplot2 tracks the
# most recently created or modified plot).
ggplotly(p = ggplot2::last_plot())